CanSim Package Introduction

A new post by hswerdfe

Howard Swerdfeger https://hswerdfe.github.io/docs/
2023-03-20

Share on,

StatsCan releases a Ton of data tables, which are readily available and easy to download. In the R world the cansim package can be used a wrapper around these data.

Firstly the cansim package can be installed with:

 install.packages('cansim')

Then Loaded with:

Once Loaded we can find a data table of interest by searching the data table of interest by keyword. In this case we use farm income :

search_result <- 
  cansim::search_cansim_cubes(search_term) |>
  dplyr::mutate(date_length  = difftime(cubeEndDate,cubeStartDate )) |> 
  dplyr::filter(cubeEndDate > '2020-01-01') |> 
  dplyr::arrange(dplyr::desc(date_length)) 


cansimId <- search_result$cansimId[[1]]

search_result |> 
  utils::head(1) |>
  dplyr::mutate_all(as.character)  |> 
  tidyr::pivot_longer(tidyr::everything())  |> 
  kableExtra::kable()
name value
cansim_table_number 32-10-0052
cubeTitleEn Net farm income
cubeTitleFr Revenu agricole net
productId 32100052
cansimId 002-0009
cubeStartDate 1926-01-01
cubeEndDate 2021-01-01
releaseTime 2022-11-28 13:30:00
archived FALSE
subjectCode 32
surveyCode 3473
frequencyCode 12
corrections
dimensionNameEn Geography, Income components
dimensionNameFr Géographie, Catégories de revenus
surveyEn Net Farm Income
surveyFr Revenu agricole net
subjectEn Agriculture and food
subjectFr Agriculture and food
date_length 34699

Using the Cansim ID given 002-0009 we can Meta Information about the data table with :

meta_raw <- cansim::get_cansim_cube_metadata(cansimId) |> janitor::clean_names()

meta_raw |> 
  utils::head(1) |>
  dplyr::mutate_all(as.character)  |> 
  tidyr::pivot_longer(tidyr::everything())  |> 
  kableExtra::kable()
name value
product_id 32-10-0052
cansim_id 002-0009
cube_title_en Net farm income
cube_title_fr Revenu agricole net
cube_start_date 1926-01-01
cube_end_date 2021-01-01
nb_series_cube 138
nb_datapoints_cube 8309
archive_status_code 2
archive_status_en CURRENT - a cube available to the public and that is current
archive_status_fr ACTIF - un cube qui est disponible au public et qui est toujours mise a jour
subject_code 320204
survey_code 3473
dimension 1,Geography,Géographie,FALSE,1,Canada,Canada,11124,1,0,2016,0,2,1,Newfoundland and Labrador,Terre-Neuve-et-Labrador,10,1,2,2016,0,3,1,Prince Edward Island,Île-du-Prince-Édouard,11,1,2,2016,0,4,1,Nova Scotia,Nouvelle-Écosse,12,1,2,2016,0,5,1,New Brunswick,Nouveau-Brunswick,13,1,2,2016,0,6,1,Quebec,Québec,24,1,2,2016,0,7,1,Ontario,Ontario,35,1,2,2016,0,8,1,Manitoba,Manitoba,46,1,2,2016,0,9,1,Saskatchewan,Saskatchewan,47,1,2,2016,0,10,1,Alberta,Alberta,48,1,2,2016,0,11,1,British Columbia,Colombie-Britannique,59,1,2,2016,0,2,Income components,Catégories de revenus,TRUE,10,Cash receipts, total,Recettes monétaires totales,0,81,11,Operating expenses after rebates,Dépenses d’exploitation après remise,0,81,12,11,Net cash income,Revenu net comptant,0,81,5,Income-in-kind,Revenu en nature,0,81,13,Depreciation charges,Dépenses d’amortissement,0,81,8,13,Realized net income,Revenu net réalisé,0,81,1,Gross income, total,Revenu brut, total,1,81,2,Value of inventory change,Valeur de la variation des stocks,0,81,9,2,Net income, total,Revenu net total,0,81,3,2,Realized gross income,Revenu brut réalisé,1,81,4,3,Cash receipts from farm products,Recettes monétaires provenant des produits agricoles,1,81,6,3,Supplementary payments,Paiements supplémentaires,1,81,7,Operating expenses and depreciation charges,Dépenses d’exploitation et d’amortissement,1,81
release_time 2022-12-06 11:00:00

And the actual data with:

dat_raw <- cansim::get_cansim(cansimId) |> 
  janitor::clean_names()

dat <- 
  dat_raw %>% 
  dplyr::mutate_if(~(purrr::is_character(.) & dplyr::n_distinct(.) <=50), as.factor)

skimr::skim(dat)
Table 1: Data summary
Name dat
Number of rows 8309
Number of columns 21
_______________________
Column type frequency:
character 3
Date 1
factor 15
numeric 2
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
ref_date 0 1 4 4 0 96 0
vector 0 1 5 5 0 138 0
coordinate 0 1 3 5 0 138 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
date 0 1 1926-07-01 2021-07-01 1974-07-01 96

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
geo 0 1.00 FALSE 11 Nov: 813, Can: 800, Sas: 800, Alb: 799
dguid 0 1.00 FALSE 11 201: 813, 201: 800, 201: 800, 201: 799
uom 0 1.00 FALSE 1 Dol: 8309
uom_id 0 1.00 FALSE 1 81: 8309
scalar_factor 0 1.00 FALSE 1 tho: 8309
scalar_id 0 1.00 FALSE 1 3: 8309
status 8309 0.00 FALSE 0 :
symbol 8309 0.00 FALSE 0 :
terminated 6288 0.24 FALSE 1 t: 2021
decimals 0 1.00 FALSE 1 0: 8309
geo_uid 0 1.00 FALSE 11 12: 813, 111: 800, 47: 800, 46: 799
hierarchy_for_geo 0 1.00 FALSE 11 1.4: 813, 1: 800, 1.9: 800, 1.1: 799
classification_code_for_income_components 8309 0.00 FALSE 0 :
hierarchy_for_income_components 0 1.00 FALSE 13 13.: 1011, 2: 1011, 2.9: 1011, 5: 1011
income_components 0 1.00 FALSE 13 Val: 1011, Net: 1011, Inc: 1011, Rea: 1011

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
value 0 1 1044984 4060323 -7586081 11909 76265 506988 83190530 ▇▁▁▁▁
val_norm 0 1 1044983702 4060323043 -7586081000 11909000 76265000 506988000 83190530000 ▇▁▁▁▁

We can see that there are several Income Components that we can look at:

  dat |>
  count(hierarchy_for_income_components, income_components) |>
  tidyr::separate(col = hierarchy_for_income_components, into = paste0('h_', 1:3), convert = TRUE, remove  = FALSE) |>
  select(-n) |> 
  rename(Hierarchy := hierarchy_for_income_components) |>
  dplyr::arrange(h_1, h_2, h_3) |> 
  kableExtra::kable()
Hierarchy h_1 h_2 h_3 income_components
1 1 NA NA Gross income, total
2.3.4 2 3 4 Cash receipts from farm products
2.3.6 2 3 6 Supplementary payments
2.3 2 3 NA Realized gross income
2.9 2 9 NA Net income, total
2 2 NA NA Value of inventory change
5 5 NA NA Income-in-kind
7 7 NA NA Operating expenses and depreciation charges
10 10 NA NA Cash receipts, total
11.12 11 12 NA Net cash income
11 11 NA NA Operating expenses after rebates
13.8 13 8 NA Realized net income
13 13 NA NA Depreciation charges

We can easily look at the time series of Value of inventory change

p_d <- 
  dat |>
  dplyr::filter(geo == 'Canada') |>
  dplyr::select(date, income_components, value, hierarchy_for_income_components) |>
  #dplyr::count(hierarchy_for_income_components, income_components)
  tidyr::separate(col = hierarchy_for_income_components, into = paste0('h_', 1:3), convert = TRUE) |>
  dplyr::arrange(date, h_1, h_2, h_3) |>
  dplyr::filter(h_1 == 2) #|>
  #dplyr::filter(is.na(h_3)) 


theme_set(theme_minimal() +   
        theme(panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        axis.text.x=element_blank(),
        axis.text.y=element_blank(),
        plot.title=element_text(size=25, hjust=0.5, face="bold", colour="grey", vjust=-1),
        plot.subtitle = element_text(size=15, hjust=0.5, face="bold", colour="grey", vjust=-1),        
        plot.caption =element_text(size=18, hjust=0.5, face="italic", color="black")
        ))


p_d_ends <- 
  p_d |> 
  group_by(income_components) |>
  #filter(date == max(date) | date == min(date) | value == min(value) | value == max(value))  |>
  filter(date == max(date) | date == min(date) )  |>
  mutate(lbl = glue::glue('{date}\n${format(round(value),  big.mark   = ",", digits = 3)}'))
  #mutate(lbl = glue::glue('{income_components}\n{date}\n${format(round(value),  big.mark = ",", digits = 3)}'))


mid_range_value <- function(x){
  mean(c(max(x), min(x)))
}

p_d_lbl <- 
  p_d |> 
  group_by(income_components) |>
  summarise(value = mid_range_value(value), date = mid_range_value(date))
  
p_d_hline <- 
  p_d |> 
  group_by(income_components) |>
  filter(max(value) > 0 & min(value) < 0) |>
  ungroup()


p <- 
  p_d |> 
  ggplot(aes(x = date, y = value, color = income_components)) +
  geom_text(data = p_d_lbl, mapping = aes(label = income_components), alpha = 0.5, size = 7) +
  geom_point() +
  geom_smooth() +
  geom_text(data = p_d_ends, mapping = aes(label = lbl, fill = income_components), alpha = 0.5) +
  #ggplot2::scale_y_log10(labels = scales::dollar_format(prefix="$")) +
  #ggplot2::scale_x_continuous() + 
  ggplot2::scale_x_date(date_breaks = '10 year', date_labels = '%Y') +
  ggplot2::guides(color = FALSE, fill= FALSE) +
  ggplot2::geom_hline(data = p_d_hline, mapping = aes(yintercept = 0), color = 'red', linetype = "dashed") +
  ggplot2::facet_wrap(~income_components, scales = 'free_y', ncol = 1) +
  labs(x = '', y = '', title = meta_raw$cube_title_en) +
  theme(
  strip.background = element_blank(),
  strip.text.x = element_blank()
  )
#p
ggplotly(p)

We do a quick search for a income_components that has a large geographical variation we find

simple_models <- 
  dat |> 
  group_by(income_components, geo) |> 
  group_modify(~ broom::tidy(lm(value ~ date, data = .x))) |>
  filter(term == 'date') |>
  ungroup() 


variation_across_geo <-
  simple_models |> 
  group_by(income_components) |> 
  summarise(estimate_sd = sd(estimate),
            estimate_rng = max(estimate)- min(estimate)) |> 
  arrange(desc(estimate_sd))


variation_across_geo |>
    kableExtra::kable()
income_components estimate_sd estimate_rng
Cash receipts, total 971.9935522 3390.820929
Operating expenses after rebates 780.2543943 2725.725726
Net cash income 193.2701312 665.095205
Depreciation charges 112.3889962 391.986964
Gross income, total 76.4585980 254.544545
Realized gross income 74.4456463 247.790012
Cash receipts from farm products 73.6189327 244.950606
Operating expenses and depreciation charges 50.5067495 166.907665
Realized net income 47.6196352 161.210619
Net income, total 47.3885855 160.854911
Supplementary payments 2.0176029 7.045422
Income-in-kind 0.8948958 3.134577
Value of inventory change 0.7301864 2.571054

Take the measurement with the most difference deviation across geo codes, this would be Cash receipts, total

library(geofacet)
income_components_curr <- variation_across_geo |> head(1) |> pull(income_components)

geo_est <- 
  simple_models |>
  filter(income_components == income_components_curr) |>
  select(geo, estimate) |>
  arrange(desc(estimate))
  



p_d <- 
  dat |>
  filter(income_components == income_components_curr) |>
  select(date, income_components, value, geo) |>
  left_join(geo_est, by = join_by(geo)) |>
  filter(geo != 'Canada')  |>
  mutate(geo = fct_reorder(geo, -estimate)) 


  


p_d_lbl <- 
  p_d |> 
  group_by(geo) |>
  summarise(value = mid_range_value(value), date = mid_range_value(date), estimate = unique(estimate)) |>
  left_join(
          ca_prov_grid1 |>
          mutate(name = factor(x = name,levels = levels(p_d$geo))) |> 
          as_tibble() |>
          filter(!is.na(name)), 
          by = join_by(geo == name )
        ) |>
  mutate(lbl = code)
  #mutate(lbl = paste0(code, '(', round(estimate, 0),')'))

  



p_d_ends <- 
  p_d |> 
  group_by(geo) |>
  #filter(date == max(date) | date == min(date) | value == min(value) | value == max(value))  |>
  filter(date == max(date) | date == min(date) )  |>
  mutate(lbl = glue::glue('{date}\n${format(round(value),  big.mark   = ",", digits = 3)}'))
  #mutate(lbl = glue::glue('{income_components}\n{date}\n${format(round(value),  big.mark = ",", digits = 3)}'))


p <- 
p_d |> 
  ggplot(aes(x = date, y = value, color = geo)) +
  geom_point() + 
  geom_smooth() + 
  geom_text(data = p_d_lbl, mapping = aes(label = lbl), alpha = 0.5, size = 12) +
  geom_text(data = p_d_ends, mapping = aes(label = lbl), alpha = 0.5) +
  # facet_geo(~ geo, 
  #           grid = "ca_prov_grid1"#, 
  #           #scales = 'free_y' 
  #           ) +
  facet_wrap(
    ~ geo, ncol = 3
  ) +
  guides(fill= FALSE, color = FALSE) +
  labs(x='', y = '', title = income_components_curr) +
  theme(
  strip.background = element_blank(),
  strip.text.x = element_blank()
  )

ggplotly(p)